In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.decomposition import PCA
from sklearn import manifold
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
In [2]:
df = pd.read_csv('Clenead/exploratory_data_analisis.csv')
df_outlier = pd.read_csv('Clenead/exploratory_data_analisis2.csv')
In [3]:
df
Out[3]:
customer_unique_id number_order_item review_score average_payment_order last_purchase_by_order late_delivered_days last_purchase_days
0 0000366f3b9a7992bf8c76cfdf3221e2 1 5.0 141.90 2018-05-10 10:56:27 -5 111
1 0000b849f77a49e4a4ce2b2a4ca5be3f 1 4.0 27.19 2018-05-07 11:11:27 -5 114
2 0000f46a3911fa3c0805444483337064 1 3.0 86.22 2017-03-10 21:05:03 -2 536
3 0000f6ccb0745a6a4b88665a16c9f078 1 4.0 43.62 2017-10-12 20:29:41 -12 320
4 0004aac84e0df4da2b147fca70cf8255 1 5.0 196.89 2017-11-14 19:45:42 -8 287
... ... ... ... ... ... ... ...
91455 fffcf5a5ff07b0908bd4e2dbc735a684 2 5.0 1033.71 2017-06-08 21:00:36 -27 446
91456 fffea47cd6d3cc0a88bd621562a9d061 1 4.0 84.58 2017-12-10 20:07:56 -3 261
91457 ffff371b4d645b6ecea244b27531430a 1 5.0 112.46 2017-02-07 15:49:16 -30 567
91458 ffff5962728ec6157033ef9805bacc48 1 5.0 133.69 2018-05-02 15:17:41 -14 118
91459 ffffd2657e2aad2907e67c3e9daecbeb 1 5.0 71.56 2017-05-02 20:18:45 -16 483

91460 rows × 7 columns

In [6]:
index = df.index
features =  df.select_dtypes(include=np.number).columns
X = df.select_dtypes(include=np.number)
In [7]:
# standardisation des données
scaler = StandardScaler()
In [8]:
X_scaled = scaler.fit_transform(X)
X_scaled
Out[8]:
array([[-0.28133371,  0.66078254, -0.0204294 ,  0.68020428, -0.82723195],
       [-0.28133371, -0.12096433, -0.60587711,  0.68020428, -0.8075944 ],
       [-0.28133371, -0.9027112 , -0.30460452,  0.97651857,  1.95475301],
       ...,
       [-0.28133371,  0.66078254, -0.17068292, -1.78908154,  2.15767426],
       [-0.28133371,  0.66078254, -0.06233094, -0.20873862, -0.78141102],
       [-0.28133371,  0.66078254, -0.37942506, -0.40628148,  1.60782312]])
In [9]:
X_scaled.shape
Out[9]:
(91460, 5)
In [10]:
n_components = 4
In [11]:
pca = PCA(n_components=0.90)
In [12]:
pca.fit(X_scaled)
Out[12]:
PCA(n_components=0.9)
In [13]:
pca.explained_variance_ratio_
Out[13]:
array([0.25724762, 0.21409974, 0.20090603, 0.1885338 , 0.1392128 ])
In [14]:
scree = (pca.explained_variance_ratio_*100).round(2)
scree
Out[14]:
array([25.72, 21.41, 20.09, 18.85, 13.92])
In [15]:
scree_cum = scree.cumsum().round()
scree_cum
Out[15]:
array([ 26.,  47.,  67.,  86., 100.])
In [16]:
x_list = range(1, 6)
list(x_list)
Out[16]:
[1, 2, 3, 4, 5]
In [17]:
# ce graphique représente l'inertie totale sur les 5 axes principaux

plt.bar(x_list, scree)
plt.plot(x_list, scree_cum,c="red",marker='o')
plt.xlabel("rang de l'axe d'inertie")
plt.ylabel("pourcentage d'inertie")
plt.title("Eboulis des valeurs propres")
plt.show(block=False)
In [19]:
pca.explained_variance_
Out[19]:
array([1.28625217, 1.07051042, 1.00454115, 0.94267931, 0.69607162])
In [20]:
pcs = pca.components_
pcs
Out[20]:
array([[ 0.11881488, -0.67685151, -0.02674377,  0.69437992, -0.21184039],
       [ 0.71760948, -0.18949429, -0.41292642, -0.17106079,  0.49935812],
       [-0.03370996, -0.24431761,  0.77173819, -0.02398148,  0.58567705],
       [ 0.64449682,  0.08309642,  0.47633506, -0.18281168, -0.56338586],
       [ 0.2332534 ,  0.66284827,  0.07938277,  0.67422512,  0.21294103]])
In [21]:
pcs = pd.DataFrame(pcs)
pcs
Out[21]:
0 1 2 3 4
0 0.118815 -0.676852 -0.026744 0.694380 -0.211840
1 0.717609 -0.189494 -0.412926 -0.171061 0.499358
2 -0.033710 -0.244318 0.771738 -0.023981 0.585677
3 0.644497 0.083096 0.476335 -0.182812 -0.563386
4 0.233253 0.662848 0.079383 0.674225 0.212941
In [22]:
pcs.columns = features
pcs.index = [f"F{i}" for i in x_list]
pcs.round(2)
Out[22]:
number_order_item review_score average_payment_order late_delivered_days last_purchase_days
F1 0.12 -0.68 -0.03 0.69 -0.21
F2 0.72 -0.19 -0.41 -0.17 0.50
F3 -0.03 -0.24 0.77 -0.02 0.59
F4 0.64 0.08 0.48 -0.18 -0.56
F5 0.23 0.66 0.08 0.67 0.21
In [23]:
pcs.T
Out[23]:
F1 F2 F3 F4 F5
number_order_item 0.118815 0.717609 -0.033710 0.644497 0.233253
review_score -0.676852 -0.189494 -0.244318 0.083096 0.662848
average_payment_order -0.026744 -0.412926 0.771738 0.476335 0.079383
late_delivered_days 0.694380 -0.171061 -0.023981 -0.182812 0.674225
last_purchase_days -0.211840 0.499358 0.585677 -0.563386 0.212941
In [24]:
fig, ax = plt.subplots(figsize=(20, 6))
sns.heatmap(pcs.T, vmin=-1, vmax=1, annot=True, cmap="coolwarm", fmt="0.2f")
Out[24]:
<AxesSubplot:>
In [25]:
def cercle_corelation(pca,x,y):
    fig, ax = plt.subplots(figsize=(10, 9))
    for i in range(0, pca.components_.shape[1]):
        ax.arrow(0,
                 0,  # Start the arrow at the origin
                 pca.components_[x, i],  #0 for PC1
                 pca.components_[y, i],  #1 for PC2
                 head_width=0.07,
                 head_length=0.07, 
                 width=0.02,              )

        plt.text(pca.components_[0, i] + 0.05,
                 pca.components_[1, i] + 0.05,
                 features[i])

    # affichage des lignes horizontales et verticales
    plt.plot([-1, 1], [0, 0], color='grey', ls='--')
    plt.plot([0, 0], [-1, 1], color='grey', ls='--')


    # nom des axes, avec le pourcentage d'inertie expliqué
    plt.xlabel('F{} ({}%)'.format(x+1, round(100*pca.explained_variance_ratio_[x],1)))
    plt.ylabel('F{} ({}%)'.format(y+1, round(100*pca.explained_variance_ratio_[y],1)))

    plt.title("Cercle des corrélations (F{} et F{})".format(x+1, y+1))


    an = np.linspace(0, 2 * np.pi, 100)
    plt.plot(np.cos(an), np.sin(an))  # Add a unit circle for scale
    plt.axis('equal')
    plt.show(block=False)
In [26]:
def correlation_graph(pca, 
                      x_y, 
                      features) : 
    """Affiche le graphe des correlations

    Positional arguments : 
    -----------------------------------
    pca : sklearn.decomposition.PCA : notre objet PCA qui a été fit
    x_y : list ou tuple : le couple x,y des plans à afficher, exemple [0,1] pour F1, F2
    features : list ou tuple : la liste des features (ie des dimensions) à représenter
    """

    # Extrait x et y 
    x,y=x_y

    # Taille de l'image (en inches)
    fig, ax = plt.subplots(figsize=(8, 7))

    # Pour chaque composante : 
    for i in range(0, pca.components_.shape[1]):

        # Les flèches
        ax.arrow(0,0, 
                pca.components_[x, i],  
                pca.components_[y, i],  
                head_width=0.07,
                head_length=0.07, 
                width=0.02, )

        # Les labels
        plt.text(pca.components_[x, i] + 0.05,
                pca.components_[y, i] + 0.05,
                features[i])
        #plt.text(pca.components_[x, i] + 0.05, pca.components_[y, i] + 0.05, fontsize='14', horizontalalignment='center',verticalalignment='center')
        
    # Affichage des lignes horizontales et verticales
    plt.plot([-1, 1], [0, 0], color='grey', ls='--')
    plt.plot([0, 0], [-1, 1], color='grey', ls='--')

    # Nom des axes, avec le pourcentage d'inertie expliqué
    plt.xlabel('F{} ({}%)'.format(x+1, round(100*pca.explained_variance_ratio_[x],1)))
    plt.ylabel('F{} ({}%)'.format(y+1, round(100*pca.explained_variance_ratio_[y],1)))

    # J'ai copié collé le code sans le lire
    plt.title("Cercle des corrélations (F{} et F{})".format(x+1, y+1))

    # Le cercle 
    an = np.linspace(0, 2 * np.pi, 100)
    plt.plot(np.cos(an), np.sin(an))  # Add a unit circle for scale

    # Axes et display
    plt.axis('equal')
    plt.show(block=False)
In [27]:
X_proj = pca.transform(X_scaled)
print(X_proj.shape)
pcs_pi = pd.DataFrame(X_proj,columns=pcs.index)
pcs_pi[pcs_pi['F1']>1]
(91460, 5)
Out[27]:
F1 F2 F3 F4 F5
7 1.974028 0.768192 0.606183 0.701988 -1.349849
9 4.674222 -0.648644 0.184721 -0.947726 1.053813
24 1.097449 -0.436968 -0.560411 -0.420235 0.659344
30 1.313328 0.551239 0.955382 -0.592607 -1.795524
36 1.640504 0.323156 -0.765799 1.132243 -1.197903
... ... ... ... ... ...
91420 2.041700 0.990251 1.773347 -1.644348 -0.492920
91434 1.548224 0.206268 3.994929 -0.547355 -0.539620
91436 3.373446 -0.593088 -0.594117 -0.197623 -0.639665
91438 2.646996 0.115925 0.068528 0.741527 0.127113
91444 4.714721 -0.704325 0.413924 -0.919716 1.160219

13223 rows × 5 columns

In [28]:
pca.inverse_transform(X_proj).shape
Out[28]:
(91460, 5)
In [29]:
def display_factorial_planes(   X_projected, 
                                x_y, 
                                pca=None, 
                                labels = None,
                                clusters=None, 
                                alpha=1,
                                figsize=[8,7], 
                                marker="." ):
    """
    Affiche la projection des individus

    Positional arguments : 
    -------------------------------------
    X_projected : np.array, pd.DataFrame, list of list : la matrice des points projetés
    x_y : list ou tuple : le couple x,y des plans à afficher, exemple [0,1] pour F1, F2

    Optional arguments : 
    -------------------------------------
    pca : sklearn.decomposition.PCA : un objet PCA qui a été fit, cela nous permettra d'afficher la variance de chaque composante, default = None
    labels : list ou tuple : les labels des individus à projeter, default = None
    clusters : list ou tuple : la liste des clusters auquel appartient chaque individu, default = None
    alpha : float in [0,1] : paramètre de transparence, 0=100% transparent, 1=0% transparent, default = 1
    figsize : list ou tuple : couple width, height qui définit la taille de la figure en inches, default = [10,8] 
    marker : str : le type de marker utilisé pour représenter les individus, points croix etc etc, default = "."
    """

    # Transforme X_projected en np.array
    X_ = np.array(X_projected)

    # On définit la forme de la figure si elle n'a pas été donnée
    if not figsize: 
        figsize = (7,6)

    # On gère les labels
    if  labels is None : 
        labels = []
    try : 
        len(labels)
    except Exception as e : 
        raise e

    # On vérifie la variable axis 
    if not len(x_y) ==2 : 
        raise AttributeError("2 axes sont demandées")   
    if max(x_y )>= X_.shape[1] : 
        raise AttributeError("la variable axis n'est pas bonne")   

    # on définit x et y 
    x, y = x_y

    # Initialisation de la figure       
    fig, ax = plt.subplots(1, 1, figsize=figsize)

    # On vérifie s'il y a des clusters ou non
    c = None if clusters is None else clusters
 
    # Les points    
    # plt.scatter(   X_[:, x], X_[:, y], alpha=alpha, 
    #                     c=c, cmap="Set1", marker=marker)
    sns.scatterplot(data=None, x=X_[:, x], y=X_[:, y], hue=c)

    # Si la variable pca a été fournie, on peut calculer le % de variance de chaque axe 
    if pca : 
        v1 = str(round(100*pca.explained_variance_ratio_[x]))  + " %"
        v2 = str(round(100*pca.explained_variance_ratio_[y]))  + " %"
    else : 
        v1=v2= ''

    # Nom des axes, avec le pourcentage d'inertie expliqué
    ax.set_xlabel(f'F{x+1} {v1}')
    ax.set_ylabel(f'F{y+1} {v2}')

    # Valeur x max et y max
    x_max = np.abs(X_[:, x]).max() *1.1
    y_max = np.abs(X_[:, y]).max() *1.1

    # On borne x et y 
    ax.set_xlim(left=-x_max, right=x_max)
    ax.set_ylim(bottom= -y_max, top=y_max)

    # Affichage des lignes horizontales et verticales
    plt.plot([-x_max, x_max], [0, 0], color='grey', alpha=0.8)
    plt.plot([0,0], [-y_max, y_max], color='grey', alpha=0.8)

    # Affichage des labels des points
    if len(labels) : 
        # j'ai copié collé la fonction sans la lire
        for i,(_x,_y) in enumerate(X_[:,[x,y]]):
            print(labels[i])
            plt.text(_x, _y+0.05, labels[i], fontsize='14', horizontalalignment='center',verticalalignment='center') 

    # Titre et display
    plt.title(f"Projection des individus (sur F{x+1} et F{y+1})")
    plt.show()
In [30]:
# les variables energy, fat et saturated fat sont liés. En effet, plsu l'aliment est gras plus il aura de l'energier. 

x, y = 0,1

correlation_graph(pca,(x,y),features)
In [31]:
df_pca = pd.DataFrame(X_proj,columns=['F1','F2','F3','F4','F5'])
In [32]:
#X = df.select_dtypes(include = np.number)
#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)
#tsne = manifold.TSNE(n_components=2, perplexity=40, init='pca', learning_rate = 200.0)
#X_tsne = tsne.fit_transform(X_scaled)
In [33]:
#X_tsne
In [34]:
from sklearn.cluster import KMeans

k  = range(1,6)
inertia = []

for i in k:
    model = KMeans(n_clusters=i)
    model.fit(X_proj)
    inertia.append(model.inertia_)
    
inertia
Out[34]:
[457300.0000000003,
 376392.85208271566,
 319234.6828643871,
 271072.79091752094,
 230724.12159447552]
In [35]:
fig, ax = plt.subplots(1,1,figsize=(12,6))

ax.set_ylabel("intertia")
ax.set_xlabel("n_cluster")
ax = plt.plot(k, inertia)
In [36]:
#df['cluster'] = model.labels_
model = KMeans(n_clusters=4)
model.fit(df_pca)
Out[36]:
KMeans(n_clusters=4)
In [37]:
df_pca['cluster'] = model.labels_
df_pca
Out[37]:
F1 F2 F3 F4 F5 cluster
0 0.167429 -0.848108 -0.668526 0.205560 0.653214 0
1 0.708053 -0.448418 -0.917843 -0.149333 0.092742 0
2 0.849700 0.904028 1.116393 -1.681224 0.386478 1
3 -0.059939 0.308584 -0.046796 -0.742689 -0.079622 1
4 -0.289886 -0.338014 0.229911 -0.255644 0.721033 1
... ... ... ... ... ... ...
91455 -1.786990 -0.410982 4.140574 2.298704 0.292643 3
91456 0.633548 -0.122655 -0.132975 -0.588038 0.454082 0
91457 -2.175498 1.126871 1.022926 -1.096250 -0.387959 1
91458 -0.458421 -0.655861 -0.652709 0.322295 0.060297 0
91459 -1.093247 0.701951 0.506634 -1.138695 0.410703 1

91460 rows × 6 columns

In [38]:
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

visualizer = KElbowVisualizer(model, k=(2,8))

visualizer.fit(X_proj)    # Fit the data to the visualizer
visualizer.poof()    # Draw/show/poof the data
Out[38]:
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
In [39]:
# Instantiate the clustering model and visualizer 
model = KMeans(4)
visualizer = SilhouetteVisualizer(model)

visualizer.fit(X_proj)    # Fit the data to the visualizer
visualizer.poof()    # Draw/show/poof the data
Out[39]:
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 91460 Samples in 4 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
In [40]:
#df['cluster'] = model.labels_
df_clust = df.select_dtypes(include=np.number)
model = KMeans(n_clusters=4)
model.fit(df_clust)
Out[40]:
KMeans(n_clusters=4)
In [41]:
df_clust['cluster' ] = model.labels_
df_clust
Out[41]:
number_order_item review_score average_payment_order late_delivered_days last_purchase_days cluster
0 1 5.0 141.90 -5 111 1
1 1 4.0 27.19 -5 114 1
2 1 3.0 86.22 -2 536 0
3 1 4.0 43.62 -12 320 0
4 1 5.0 196.89 -8 287 0
... ... ... ... ... ... ...
91455 2 5.0 1033.71 -27 446 3
91456 1 4.0 84.58 -3 261 0
91457 1 5.0 112.46 -30 567 0
91458 1 5.0 133.69 -14 118 1
91459 1 5.0 71.56 -16 483 0

91460 rows × 6 columns

In [42]:
df_clust.cluster.value_counts()
Out[42]:
1    49270
0    36647
3     4894
2      649
Name: cluster, dtype: int64
In [43]:
df_clust.loc[df_clust.cluster == 0]
Out[43]:
number_order_item review_score average_payment_order late_delivered_days last_purchase_days cluster
2 1 3.0 86.22 -2 536 0
3 1 4.0 43.62 -12 320 0
4 1 5.0 196.89 -8 287 0
8 1 4.0 150.12 -28 542 0
10 1 3.0 29.00 -12 407 0
... ... ... ... ... ... ...
91450 1 4.0 55.00 -7 361 0
91453 1 5.0 81.20 -7 302 0
91456 1 4.0 84.58 -3 261 0
91457 1 5.0 112.46 -30 567 0
91459 1 5.0 71.56 -16 483 0

36647 rows × 6 columns

In [44]:
import plotly.express as px

df = px.data.tips()
fig = px.box(df_clust,  y="number_order_item", color="cluster",
             notched=True, # used notched shape
             title="Cluster sur les commandes clients",
             hover_data=["cluster"] # add day column to hover data
)

fig.show()
In [45]:
fig = px.box(df_clust,  y="review_score", color="cluster",
             notched=True, # used notched shape
             title="Cluster sur les satisfactions clients",
             hover_data=["cluster"] # add day column to hover data
)

fig.show()
In [46]:
fig = px.box(df_clust,  y="average_payment_order", color="cluster",
             notched=True, # used notched shape
             title="Cluster sur les dépenses clients",
             hover_data=["cluster"] # add day column to hover data
)
fig.show()
In [47]:
fig = px.box(df_clust,  y="late_delivered_days", color="cluster",
             notched=True, # used notched shape
             title="Cluster sur les retards de livraisons clients",
             hover_data=["cluster"] # add day column to hover data
)
fig.show()
In [48]:
df_clust.cluster.value_counts()
Out[48]:
1    49270
0    36647
3     4894
2      649
Name: cluster, dtype: int64
In [49]:
df_clust[df_clust.cluster == 2]
Out[49]:
number_order_item review_score average_payment_order late_delivered_days last_purchase_days cluster
128 1 5.0 1600.51 -19 149 2
286 1 5.0 2304.68 -14 33 2
411 1 5.0 4016.91 -10 528 2
432 1 5.0 1429.59 -8 166 2
564 1 5.0 1841.11 1 28 2
... ... ... ... ... ... ...
91116 1 5.0 3048.27 -20 449 2
91121 1 5.0 1672.67 -12 515 2
91193 1 5.0 6726.66 -23 461 2
91258 2 5.0 1568.72 -16 41 2
91369 1 5.0 1626.83 -23 547 2

649 rows × 6 columns

In [54]:
df_clust[df_clust.cluster == 2].average_payment_order.min(), df_clust[df_clust.cluster == 2].average_payment_order.max()
Out[54]:
(1146.2, 6929.31)
In [55]:
df_clust[df_clust.cluster == 1].average_payment_order.min(), df_clust[df_clust.cluster == 2].average_payment_order.max()
Out[55]:
(9.59, 6929.31)
In [56]:
df_clust[df_clust.cluster == 0].average_payment_order.min(), df_clust[df_clust.cluster == 2].average_payment_order.max()
Out[56]:
(9.34142857142857, 6929.31)

Kmeans 2 clusters¶

In [59]:
#df['cluster'] = model.labels_
model = KMeans(n_clusters=3)
model.fit(df_clust)
Out[59]:
KMeans(n_clusters=3)
In [60]:
df_clust.loc[:,'cluster' ] = model.labels_
df_clust
Out[60]:
number_order_item review_score average_payment_order late_delivered_days last_purchase_days cluster
0 1 5.0 141.90 -5 111 0
1 1 4.0 27.19 -5 114 0
2 1 3.0 86.22 -2 536 1
3 1 4.0 43.62 -12 320 1
4 1 5.0 196.89 -8 287 1
... ... ... ... ... ... ...
91455 2 5.0 1033.71 -27 446 2
91456 1 4.0 84.58 -3 261 1
91457 1 5.0 112.46 -30 567 1
91458 1 5.0 133.69 -14 118 0
91459 1 5.0 71.56 -16 483 1

91460 rows × 6 columns

In [61]:
df_clust.cluster.value_counts()
Out[61]:
0    50916
1    37889
2     2655
Name: cluster, dtype: int64
In [64]:
df = px.data.tips()
fig = px.box(df_clust,  y="number_order_item", color="cluster",
             notched=True, # used notched shape
             title="Cluster sur les commandes clients",
             hover_data=["cluster"] # add day column to hover data
)

fig.show()
In [65]:
df = px.data.tips()
fig = px.box(df_clust,  y="average_payment_order", color="cluster",
             notched=True, # used notched shape
             title="Cluster sur les dépenses clients",
             hover_data=["cluster"] # add day column to hover data
)
fig.show()
In [66]:
df = px.data.tips()
fig = px.box(df_clust,  y="late_delivered_days", color="cluster",
             notched=True, # used notched shape
             title="Cluster sur les retards de livraisons",
             hover_data=["cluster"] # add day column to hover data
)

fig.show()
In [67]:
df = px.data.tips()
fig = px.box(df_clust,  y="review_score", color="cluster",
             notched=True, # used notched shape
             title="Cluster sur les scores clients",
             hover_data=["cluster"] # add day column to hover data
)

fig.show()
In [68]:
# Instantiate the clustering model and visualizer 
model = KMeans(2)
visualizer = SilhouetteVisualizer(model)

visualizer.fit(df_clust)    # Fit the data to the visualizer
visualizer.poof()    # Draw/show/poof the data
Out[68]:
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 91460 Samples in 2 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>

Cluster avec le t_SNE¶

In [170]:
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn import metrics



inertie_lst = []
silhouette_lst  = []

k = 2

for row in range(3):
    print('k =',k)
    model = KMeans(n_clusters = k)
    model.fit(X_proj)
    inertie_lst.append(model.inertia_)
        
    # calcul du coefficient de silhouette 
        
    silhouette_lst.append(metrics.silhouette_score(X_tsne, model.labels_))
        
    centers = model.cluster_centers_
        
        
    k+=1
k = 2
k = 3
k = 4
In [67]:
silhouette_lst
Out[67]:
[0.111704454, 0.124344744, 0.113509364]
In [172]:
fig, ax = plt.subplots(1)
ax.set_xticks(np.arange(15))
ax.plot( silhouette_lst)
ax.set_xlabel('k', fontsize=20)
ax.set_ylabel('Silhouette', fontsize=20)

fig.set_figheight(8)
fig.set_figwidth(8)
plt.grid()

Clustering t_SNE¶

In [38]:
from sklearn.cluster import KMeans


model = KMeans(n_clusters=5)
model.fit(X_tsne)
cluster = model.predict(X_tsne) 
cluster
Out[38]:
array([4, 4, 3, ..., 0, 0, 1])
In [42]:
visualizer = KElbowVisualizer(model, k=(2,8))

visualizer.fit(X_tsne)    # Fit the data to the visualizer
visualizer.poof()    # Draw/show/poof the data
Out[42]:
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
In [46]:
# Instantiate the clustering model and visualizer 
model = KMeans(4)
visualizer = SilhouetteVisualizer(model)

visualizer.fit(X_tsne)    # Fit the data to the visualizer
visualizer.poof()    # Draw/show/poof the data
Out[46]:
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 91460 Samples in 4 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
In [49]:
data_x_tnse = pd.DataFrame(X_tsne)
data_x_tnse['cluster'] = cluster
data_x_tnse
Out[49]:
0 1 cluster
0 -1.135180 5.481637 4
1 -29.377144 44.306736 4
2 -11.386347 41.802914 3
3 -25.837961 13.150128 4
4 35.942589 -11.248713 2
... ... ... ...
91455 47.699753 13.797524 2
91456 -20.651836 41.465958 4
91457 9.849265 -47.742378 0
91458 7.628273 -29.965574 0
91459 -25.084793 -33.355484 1

91460 rows × 3 columns

In [52]:
data_x_tnse.columns
Out[52]:
Index([0, 1, 'cluster'], dtype='object')
In [53]:
sns.scatterplot(data = data_x_tnse, x = 0, y= 1,hue = 'cluster')
Out[53]:
<AxesSubplot:xlabel='0', ylabel='1'>
In [69]:
from sklearn.cluster import KMeans

df_clust2 = df_outlier.select_dtypes(include=np.number)

k  = range(1,5)
inertia = []


for i in k:
    model = KMeans(n_clusters=i)
    model.fit(df_clust2)
    inertia.append(model.inertia_)
    
inertia
Out[69]:
[2246741301.155181, 921199418.7926846, 611620628.081596, 492114342.4343247]
In [70]:
fig, ax = plt.subplots(1,1,figsize=(12,6))

ax.set_ylabel("intertia")
ax.set_xlabel("n_cluster")
ax = plt.plot(k, inertia)
In [80]:
#df['cluster'] = model.labels_
df_clust2 = df_outlier.select_dtypes(include=np.number)
df_clust2 = df_clust2.loc[:,['number_order_item','average_payment_order','last_purchase_days']]
model = KMeans(n_clusters=3)
model.fit(df_clust2)
Out[80]:
KMeans(n_clusters=3)
In [87]:
df_clust2['cluster' ] = model.labels_
df_clust2
Out[87]:
number_order_item average_payment_order last_purchase_days cluster
0 1 141.90 111 0
1 1 27.19 114 0
2 1 86.22 536 1
3 1 43.62 320 2
4 1 196.89 287 2
... ... ... ... ...
80217 1 73.16 254 2
80218 1 84.58 261 2
80219 1 112.46 567 1
80220 1 133.69 118 0
80221 1 71.56 483 1

80222 rows × 4 columns

In [82]:
df_clust2.cluster.value_counts()
Out[82]:
0    30682
2    30297
1    19243
Name: cluster, dtype: int64
In [88]:
import plotly.express as px

df = px.data.tips()
fig = px.box(df_clust2,  y="last_purchase_days", color="cluster",
             notched=True, # used notched shape
             title="Box plot cluster",
             hover_data=["cluster"] # add day column to hover data
)

fig.show()
In [83]:
import plotly.express as px

df = px.data.tips()
fig = px.box(df_clust2,  y="number_order_item", color="cluster",
             notched=True, # used notched shape
             title="Box plot cluster",
             hover_data=["cluster"] # add day column to hover data
)

fig.show()
In [86]:
fig = px.box(df_clust2,  y="average_payment_order", color="cluster",
             notched=True, # used notched shape
             title="Box plot cluster",
             hover_data=["cluster"] # add day column to hover data
)
fig.show()
In [77]:
df_outlier.describe()
Out[77]:
number_order_item review_score average_payment_order late_delivered_days last_purchase_days
count 80222.000000 80222.000000 80222.000000 80222.000000 80222.000000
mean 1.128780 4.298658 104.421378 -12.116352 238.825933
std 0.527578 1.140118 66.871755 9.571503 153.107393
min 1.000000 1.000000 6.080000 -147.000000 0.000000
25% 1.000000 4.000000 54.620000 -17.000000 114.000000
50% 1.000000 5.000000 86.501667 -13.000000 221.000000
75% 1.000000 5.000000 137.670000 -7.000000 349.000000
max 33.000000 5.000000 436.690000 188.000000 694.000000
In [91]:
# Instantiate the clustering model and visualizer 
model = KMeans(3)
visualizer = SilhouetteVisualizer(model)
df_clust2 = df_clust2.loc[:,['number_order_item','average_payment_order','last_purchase_days']]

visualizer.fit(df_clust2)    # Fit the data to the visualizer
visualizer.poof()    # Draw/show/poof the data
Out[91]:
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 80222 Samples in 3 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
In [92]:
# Instantiate the clustering model and visualizer 
model = KMeans(2)
visualizer = SilhouetteVisualizer(model)

visualizer.fit(df_clust2)    # Fit the data to the visualizer
visualizer.poof()    # Draw/show/poof the data
Out[92]:
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 80222 Samples in 2 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
In [87]:
scaler2 = StandardScaler()
df_scalled = scaler2.fit_transform(df_outlier.select_dtypes(include = np.number))
df_scalled
Out[87]:
array([[-0.24409852,  0.61515247,  0.30484942,  0.74349834],
       [-0.24409852, -0.26195525, -1.01228746,  0.74349834],
       [-0.24409852, -1.13906297, -0.33448614,  1.0569307 ],
       ...,
       [-0.24409852,  0.61515247, -0.03319007, -1.86843797],
       [-0.24409852,  0.61515247,  0.21057957, -0.19679873],
       [-0.24409852,  0.61515247, -0.50281694, -0.40575364]])
In [89]:
#df['cluster'] = model.labels_
#df_outlier[df_outlier.total_payment < ]
model = KMeans(n_clusters=2)
model.fit(df_scalled)
Out[89]:
KMeans(n_clusters=2)
In [92]:
data_scalled = pd.DataFrame(df_scalled, columns=df_outlier.select_dtypes(include = np.number).columns)
data_scalled
Out[92]:
order_id review_score total_payment late_delivered
0 -0.244099 0.615152 0.304849 0.743498
1 -0.244099 -0.261955 -1.012287 0.743498
2 -0.244099 -1.139063 -0.334486 1.056931
3 -0.244099 -0.261955 -0.823633 0.012156
4 -0.244099 0.615152 0.936262 0.430066
... ... ... ... ...
80217 -0.244099 0.615152 -0.484445 -1.137096
80218 -0.244099 -0.261955 -0.353317 0.952453
80219 -0.244099 0.615152 -0.033190 -1.868438
80220 -0.244099 0.615152 0.210580 -0.196799
80221 -0.244099 0.615152 -0.502817 -0.405754

80222 rows × 4 columns

In [93]:
data_scalled['cluster' ] = model.labels_
data_scalled
Out[93]:
order_id review_score total_payment late_delivered cluster
0 -0.244099 0.615152 0.304849 0.743498 1
1 -0.244099 -0.261955 -1.012287 0.743498 1
2 -0.244099 -1.139063 -0.334486 1.056931 0
3 -0.244099 -0.261955 -0.823633 0.012156 1
4 -0.244099 0.615152 0.936262 0.430066 1
... ... ... ... ... ...
80217 -0.244099 0.615152 -0.484445 -1.137096 1
80218 -0.244099 -0.261955 -0.353317 0.952453 1
80219 -0.244099 0.615152 -0.033190 -1.868438 1
80220 -0.244099 0.615152 0.210580 -0.196799 1
80221 -0.244099 0.615152 -0.502817 -0.405754 1

80222 rows × 5 columns

In [94]:
data_scalled.cluster.value_counts()
Out[94]:
1    67327
0    12895
Name: cluster, dtype: int64
In [78]:
d.cluster.value_counts()
Out[78]:
0    61982
1    18240
Name: cluster, dtype: int64
In [79]:
import plotly.express as px

df = px.data.tips()
fig = px.box(df_clust2,  y="order_id", color="cluster",
             notched=True, # used notched shape
             title="Box plot cluster",
             hover_data=["cluster"] # add day column to hover data
)

fig.show()
In [80]:
fig = px.box(df_clust2,  y="review_score", color="cluster",
             notched=True, # used notched shape
             title="Box plot cluster",
             hover_data=["cluster"] # add day column to hover data
)

fig.show()
In [81]:
fig = px.box(df_clust2,  y="total_payment", color="cluster",
             notched=True, # used notched shape
             title="Box plot cluster",
             hover_data=["cluster"] # add day column to hover data
)
fig.show()
In [97]:
import plotly.express as px

df = px.data.tips()
fig = px.box(data_scalled,  y="order_id", color="cluster",
             notched=True, # used notched shape
             title="Box plot cluster",
             hover_data=["cluster"] # add day column to hover data
)

fig.show()
In [98]:
fig = px.box(data_scalled,  y="review_score", color="cluster",
             notched=True, # used notched shape
             title="Box plot cluster",
             hover_data=["cluster"] # add day column to hover data
)

fig.show()
In [99]:
fig = px.box(data_scalled,  y="total_payment", color="cluster",
             notched=True, # used notched shape
             title="Box plot cluster",
             hover_data=["cluster"] # add day column to hover data
)
fig.show()
In [102]:
data_scalled.total_payment.max(),data_scalled.total_payment.min()
Out[102]:
(40.06662947607055, -1.2546792312474864)